## Evaluation - Four metrics are calculated using the test set - \(\text{Accuracy}=\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\) - \(\text{Recall}=\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\) - \(\text{Precision}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\) - \(\text{F1}=\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
While there are a small number of threatened species with a large range, it is clear that Range is likely a strong predictor of Group status: a lower range predicts a higher likelihood of a threatened or extinct grouping.
# Bin Range into 20 quantile groups, then treat every predictor as categorical
# so Cramer's V (association between nominal variables) is meaningful.
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
corrDF %<>% mutate_at(c("Group", "LF", "GF", "Biomes", "Range",
                        "Habitat_degradation", "Habitat_loss", "IAS", "Other",
                        "Over_exploitation", "Pollution", "Unknown"), factor)
# NOTE(review): the original applied ntile() a second time here, after Range
# had already been binned and converted to a factor; that call was redundant
# (and ntile() on a factor is fragile), so it has been removed.
# Pairwise Cramer's V, lower triangle only.
corrplot::corrplot(DescTools::PairApply(corrDF, DescTools::CramerV),
                   type = "lower")

data_train_AB <- data_train
# Balance the three classes by pairwise oversampling with ROSE::ovun.sample:
# each two-class subset is oversampled to N = 980 rows (2 * 490, the majority
# class count), then the pieces are recombined into one balanced training set.
data_train_AB <- data_train_AB[data_train_AB$label != '3', ]  # classes 1 vs 2
data_train_AB_resampled <- ovun.sample(label ~ .,
                                       data = data_train_AB, method = "over",
                                       N = 980, seed = 1)$data
data_train_AC <- data_train
data_train_AC <- data_train_AC[data_train_AC$label != '2', ]  # classes 1 vs 3
data_train_AC_resampled <- ovun.sample(label ~ .,
                                       data = data_train_AC,
                                       method = "over", N = 980,
                                       seed = 1)$data
# Extract each oversampled minority class plus the (untouched) majority class.
data_train_AB_2 <- data_train_AB_resampled[data_train_AB_resampled$label == '2', ]
data_train_AC_3 <- data_train_AC_resampled[data_train_AC_resampled$label == '3', ]
data_train_1 <- data_train_AB_resampled[data_train_AB_resampled$label == '1', ]
data_train_combined <- rbind(data_train_1, data_train_AB_2, data_train_AC_3)
cat("Group Counts Pre-Balancing: ", table(data_train$label),
    "\nGroup Counts Post-Balancing: ", table(data_train_combined$label))
# Output:
#   Group Counts Pre-Balancing:  490 148 23
#   Group Counts Post-Balancing: 490 490 490
\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(X_{old})}\)
Score
Accuracy 0.88
Recall 0.79
Precision 0.82
F1 0.81
# --- Normalization 1: min-max scaling, X_new = (X - min) / (max - min) ---
# NOTE(review): the test set is scaled with its own min/max rather than the
# training set's statistics — mild data leakage; confirm whether intended.
features_train_1 <- as.data.frame(lapply(features_train,
                                         function(x) (x - min(x)) / (max(x) - min(x))))
features_test_1 <- as.data.frame(lapply(features_test,
                                        function(x) (x - min(x)) / (max(x) - min(x))))
# NOTE(review): this overwrites the `data_train_1` class-1 subset created
# during resampling above; rename one of them if both are still needed.
data_train_1 <- features_train_1
data_train_1$label <- label
class_counts_1 <- table(data_train_1$label)
# BUG FIX: drop the label column using this frame's own column count — the
# original indexed with ncol(data_train_combined), which only works by
# coincidence when both frames have the same number of columns.
# NOTE(review): ntree = 2 is unusually small for randomForest (default 500);
# confirm intentional.
model_1 <- randomForest(x = data_train_1[-ncol(data_train_1)],
                        y = as.factor(data_train_1$label), ntree = 2)
variable_importance_1 <- importance(model_1)
pred_comb_1 <- predict(model_1, features_test_1)
# Macro-averaged evaluation over the three classes.
accuracy <- sum(label_test == pred_comb_1) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_1)
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall',
                          'Precision', 'F1')
print(printTable)
# Normalization 2 (next section): z-score, X_new = (X - mean(X)) / sd(X)
Score
Accuracy 0.40
Recall 0.53
Precision 0.41
F1 0.46
# --- Normalization 2: z-score standardization, X_new = (X - mean) / sd ---
# NOTE(review): test set standardized with its own mean/sd (mild leakage).
features_train_2 <- as.data.frame(lapply(features_train,
                                         function(x) (x - mean(x)) / sd(x)))
features_test_2 <- as.data.frame(lapply(features_test,
                                        function(x) (x - mean(x)) / sd(x)))
data_train_2 <- features_train_2
data_train_2$label <- label
class_counts_2 <- table(data_train_2$label)
# BUG FIX: drop the label column using this frame's own column count
# (the original indexed with ncol(data_train_combined)).
model_2 <- randomForest(x = data_train_2[-ncol(data_train_2)],
                        y = as.factor(data_train_2$label), ntree = 2)
variable_importance_2 <- importance(model_2)
pred_comb_2 <- predict(model_2, features_test_2)
accuracy <- sum(label_test == pred_comb_2) / length(label_test)
label_test_factor <- as.factor(label_test)
# BUG FIX: renamed from pred_comb_1_factor, which misleadingly clobbered
# the factor built for model 1.
pred_comb_2_factor <- as.factor(pred_comb_2)
cm <- confusionMatrix(pred_comb_2_factor, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
# Normalization 3 (next section): max-abs scaling, X_new = X / max(|X|)
Score
Accuracy 0.88
Recall 0.71
Precision 0.79
F1 0.75
# --- Normalization 3: max-abs scaling, X_new = X / max(|X|) ---
features_train_3 <- as.data.frame(lapply(features_train,
                                         function(x) x / max(abs(x))))
features_test_3 <- as.data.frame(lapply(features_test,
                                        function(x) x / max(abs(x))))
data_train_3 <- features_train_3
data_train_3$label <- label
class_counts_3 <- table(data_train_3$label)
# BUG FIX: drop the label column using this frame's own column count
# (the original indexed with ncol(data_train_combined)).
model_3 <- randomForest(x = data_train_3[-ncol(data_train_3)],
                        y = as.factor(data_train_3$label),
                        ntree = 2)
variable_importance_3 <- importance(model_3)
pred_comb_3 <- predict(model_3, features_test_3)
accuracy <- sum(label_test == pred_comb_3) / length(label_test)
label_test_factor <- as.factor(label_test)
# BUG FIX: renamed from pred_comb_1_factor, which misleadingly clobbered
# the factor built for model 1.
pred_comb_3_factor <- as.factor(pred_comb_3)
cm <- confusionMatrix(pred_comb_3_factor, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
# Normalization 4 (next section): L1 scaling, X_new = X / sum(|X|)
# NOTE(review): the original knitted formula here showed min-max scaling,
# which does not match the code that follows.
Score
Accuracy 0.77
Recall 0.57
Precision 0.54
F1 0.56
# --- Normalization 4: L1 scaling, X_new = X / sum(|X|) ---
features_train_4 <- as.data.frame(lapply(features_train,
                                         function(x) x / sum(abs(x))))
features_test_4 <- as.data.frame(lapply(features_test,
                                        function(x) x / sum(abs(x))))
data_train_4 <- features_train_4
data_train_4$label <- label
class_counts_4 <- table(data_train_4$label)
# BUG FIX: drop the label column using this frame's own column count
# (the original indexed with ncol(data_train_combined)).
model_4 <- randomForest(x = data_train_4[-ncol(data_train_4)],
                        y = as.factor(data_train_4$label), ntree = 2)
variable_importance_4 <- importance(model_4)
pred_comb_4 <- predict(model_4, features_test_4)
accuracy <- sum(label_test == pred_comb_4) / length(label_test)
label_test_factor <- as.factor(label_test)
# BUG FIX: renamed from pred_comb_1_factor, which misleadingly clobbered
# the factor built for model 1.
pred_comb_4_factor <- as.factor(pred_comb_4)
cm <- confusionMatrix(pred_comb_4_factor, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
# Normalization 5 (next section): L2 unit norm, X_new = X / sqrt(sum(X^2))
# NOTE(review): the original knitted formula here showed z-score
# standardization, which does not match the code that follows.
Score
Accuracy 0.75
Recall 0.43
Precision 0.49
F1 0.46
# --- Normalization 5: L2 unit norm, X_new = X / sqrt(sum(X^2)) ---
features_train_5 <- as.data.frame(lapply(features_train,
                                         function(x) x / sqrt(sum(x^2))))
features_test_5 <- as.data.frame(lapply(features_test,
                                        function(x) x / sqrt(sum(x^2))))
data_train_5 <- features_train_5
data_train_5$label <- label
class_counts_5 <- table(data_train_5$label)
# BUG FIX: drop the label column using this frame's own column count
# (the original indexed with ncol(data_train_combined)).
model_5 <- randomForest(x = data_train_5[-ncol(data_train_5)],
                        y = as.factor(data_train_5$label), ntree = 2)
variable_importance_5 <- importance(model_5)
pred_comb_5 <- predict(model_5, features_test_5)
accuracy <- sum(label_test == pred_comb_5) / length(label_test)
label_test_factor <- as.factor(label_test)
# BUG FIX: renamed from pred_comb_1_factor, which misleadingly clobbered
# the factor built for model 1.
pred_comb_5_factor <- as.factor(pred_comb_5)
cm <- confusionMatrix(pred_comb_5_factor, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)

# Number of test-set predictions, used by the voting ensemble below.
n <- length(pred_comb_1)
# Majority-vote ensemble across the five per-normalization models.
# Ties resolve via which.max: the first mode in table() order, i.e. the
# smallest label value.
final_pred <- rep(NA, n)
for (i in seq_len(n)) {  # seq_len() is safe when n == 0 (1:n is not)
  preds <- c(pred_comb_1[i], pred_comb_2[i], pred_comb_3[i],
             pred_comb_4[i], pred_comb_5[i])
  final_pred[i] <- as.numeric(names(which.max(table(preds))))
}
# Element-wise average of the five variable-importance matrices.
importances_list <- list(variable_importance_1, variable_importance_2,
                         variable_importance_3, variable_importance_4,
                         variable_importance_5)
average_importance <- Reduce("+", importances_list) / length(importances_list)
print(average_importance)
accuracy <- sum(label_test == final_pred) / length(label_test)
print(paste('Accuracy of Voting method:', accuracy))
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
# Macro-averaged recall / precision / F1 over the three classes.
sensitivity_class1 <- cm_vote$byClass["Class: 1", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Class: 2", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Class: 3", "Sensitivity"]
recall <- (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3
print(paste('Recall :', recall))
precision_class1 <- cm_vote$byClass["Class: 1", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Class: 2", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Class: 3", "Pos Pred Value"]
precision <- (precision_class1 + precision_class2 + precision_class3) / 3
print(paste('Precision :', precision))
F1 <- 2 * recall * precision / (recall + precision)
print(paste('F1 :', F1))
# Output: Confusion Matrix and Statistics (full table reproduced below)
Reference
Prediction 1 2 3
1 200 12 1
2 10 51 0
3 0 0 9
Overall Statistics
Accuracy : 0.9187
95% CI : (0.8805, 0.9478)
No Information Rate : 0.742
P-Value [Acc > NIR] : 3.035e-14
Kappa : 0.7929
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 0.9524 0.8095 0.90000
Specificity 0.8219 0.9545 1.00000
Pos Pred Value 0.9390 0.8361 1.00000
Neg Pred Value 0.8571 0.9459 0.99635
Prevalence 0.7420 0.2226 0.03534
Detection Rate 0.7067 0.1802 0.03180
Detection Prevalence 0.7527 0.2155 0.03180
Balanced Accuracy 0.8871 0.8820 0.95000
# Plot the confusion matrix as a heat map, with the overall statistics as a
# side table.
# BUG FIX: the original built the plot from `cm`, which at this point holds
# the confusion matrix of model 5 (the last single model evaluated), while
# the statistics printed above come from the voting ensemble. Use `cm_vote`
# so the plot matches the reported results.
cm_d <- as.data.frame(cm_vote$table)
cm_st <- data.frame(cm_vote$overall)
names(cm_st) <- "cm.overall"  # keep the column name the code below expects
cm_st$cm.overall <- round(cm_st$cm.overall, 2)
cm_d$diag <- cm_d$Prediction == cm_d$Reference   # TRUE on correct cells
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference
# Blank zero counts (and FALSE flags, which compare equal to 0) to NA so
# empty cells render white and off-diagonal cells pick up the -1 sign below.
cm_d[cm_d == 0] <- NA
# reverse.levels(): presumably flips factor level order so the reference
# axis reads top-to-bottom — TODO confirm which package provides it.
cm_d$Reference <- reverse.levels(cm_d$Reference)
# Signed frequency: misclassified cells become negative so the diverging
# fill scale colors them differently from correct cells.
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag), -1, 1)
plt1 <- ggplot(data = cm_d, aes(x = Prediction, y = Reference,
                                fill = Freq)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = cm_d, aes(fill = ref_freq)) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_gradient2(guide = "none", low = "red", high = "mediumvioletred",
                       mid = "mistyrose",
                       midpoint = 0, na.value = 'white') +
  geom_text(aes(label = Freq), color = 'black', size = 3) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none",
        panel.border = element_blank(),
        plot.background = element_blank(),
        axis.line = element_blank(),
  )
plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
             top = textGrob("Confusion Matrix",
                            gp = gpar(fontsize = 25, font = 1)))
# --- Concluding discussion (recovered from knitted output) ---
# Random Forest is a powerful and flexible machine learning algorithm that
# can be used for a wide range of tasks. It is particularly useful when
# dealing with complex data composed of a large number of features, and when
# the goal is to achieve high predictive accuracy while avoiding overfitting.
# The algorithm incorporates versatility in its capabilities for
# classification and regression tasks, handling missing data, and displaying
# robustness when faced with outliers and noisy data.
# Most extinctions were perennial shrubs found in the Cape Floristic Region.
# As range was the strongest predictor of extinction, many of the recorded
# taxa deemed susceptible were range-restricted. Habitat loss is presented as
# the second strongest variable of importance in predicting plant
# extinctions. Predictions were based on a quantitative, evidence-based
# approach, though gaps in knowledge highlighted areas for further study.
# Improved species monitoring and documentation of threat factors will aid in
# a deeper understanding of the ecological role and value of South African
# plant species.